{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# First Data Check\n",
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create entry points to spark\n",
    "try:\n",
    "    sc.stop()\n",
    "except:\n",
    "    pass\n",
    "from pyspark import SparkContext, SparkConf\n",
    "from pyspark.sql import SparkSession\n",
    "sc=SparkContext()\n",
    "spark = SparkSession(sparkContext=sc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import Data\n",
    "\n",
    "Data source: https://www.kaggle.com/c/titanic/data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n",
      "|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|\n",
      "+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n",
      "|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|\n",
      "|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|\n",
      "|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|\n",
      "|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|\n",
      "|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|\n",
      "+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "titanic = spark.read.csv('../../data/kaggle-titanic-train.csv', header=True, inferSchema=True)\n",
    "titanic.show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data type\n",
    "\n",
    "First, we want to check if string and numeric variables are imported as we expect."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- PassengerId: integer (nullable = true)\n",
      " |-- Survived: integer (nullable = true)\n",
      " |-- Pclass: integer (nullable = true)\n",
      " |-- Name: string (nullable = true)\n",
      " |-- Sex: string (nullable = true)\n",
      " |-- Age: double (nullable = true)\n",
      " |-- SibSp: integer (nullable = true)\n",
      " |-- Parch: integer (nullable = true)\n",
      " |-- Ticket: string (nullable = true)\n",
      " |-- Fare: double (nullable = true)\n",
      " |-- Cabin: string (nullable = true)\n",
      " |-- Embarked: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "titanic.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data summary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Number of variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(titanic.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Number of observations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "891"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titanic.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sumarise columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def describe_columns(df):\n",
    "    for i in df.columns:\n",
    "        print('Column: ' + i)\n",
    "        titanic.select(i).describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column: PassengerId\n",
      "+-------+-----------------+\n",
      "|summary|      PassengerId|\n",
      "+-------+-----------------+\n",
      "|  count|              891|\n",
      "|   mean|            446.0|\n",
      "| stddev|257.3538420152301|\n",
      "|    min|                1|\n",
      "|    max|              891|\n",
      "+-------+-----------------+\n",
      "\n",
      "Column: Survived\n",
      "+-------+-------------------+\n",
      "|summary|           Survived|\n",
      "+-------+-------------------+\n",
      "|  count|                891|\n",
      "|   mean| 0.3838383838383838|\n",
      "| stddev|0.48659245426485753|\n",
      "|    min|                  0|\n",
      "|    max|                  1|\n",
      "+-------+-------------------+\n",
      "\n",
      "Column: Pclass\n",
      "+-------+------------------+\n",
      "|summary|            Pclass|\n",
      "+-------+------------------+\n",
      "|  count|               891|\n",
      "|   mean| 2.308641975308642|\n",
      "| stddev|0.8360712409770491|\n",
      "|    min|                 1|\n",
      "|    max|                 3|\n",
      "+-------+------------------+\n",
      "\n",
      "Column: Name\n",
      "+-------+--------------------+\n",
      "|summary|                Name|\n",
      "+-------+--------------------+\n",
      "|  count|                 891|\n",
      "|   mean|                null|\n",
      "| stddev|                null|\n",
      "|    min|\"Andersson, Mr. A...|\n",
      "|    max|van Melkebeke, Mr...|\n",
      "+-------+--------------------+\n",
      "\n",
      "Column: Sex\n",
      "+-------+------+\n",
      "|summary|   Sex|\n",
      "+-------+------+\n",
      "|  count|   891|\n",
      "|   mean|  null|\n",
      "| stddev|  null|\n",
      "|    min|female|\n",
      "|    max|  male|\n",
      "+-------+------+\n",
      "\n",
      "Column: Age\n",
      "+-------+------------------+\n",
      "|summary|               Age|\n",
      "+-------+------------------+\n",
      "|  count|               714|\n",
      "|   mean| 29.69911764705882|\n",
      "| stddev|14.526497332334035|\n",
      "|    min|              0.42|\n",
      "|    max|              80.0|\n",
      "+-------+------------------+\n",
      "\n",
      "Column: SibSp\n",
      "+-------+------------------+\n",
      "|summary|             SibSp|\n",
      "+-------+------------------+\n",
      "|  count|               891|\n",
      "|   mean|0.5230078563411896|\n",
      "| stddev|1.1027434322934315|\n",
      "|    min|                 0|\n",
      "|    max|                 8|\n",
      "+-------+------------------+\n",
      "\n",
      "Column: Parch\n",
      "+-------+-------------------+\n",
      "|summary|              Parch|\n",
      "+-------+-------------------+\n",
      "|  count|                891|\n",
      "|   mean|0.38159371492704824|\n",
      "| stddev| 0.8060572211299488|\n",
      "|    min|                  0|\n",
      "|    max|                  6|\n",
      "+-------+-------------------+\n",
      "\n",
      "Column: Ticket\n",
      "+-------+------------------+\n",
      "|summary|            Ticket|\n",
      "+-------+------------------+\n",
      "|  count|               891|\n",
      "|   mean|260318.54916792738|\n",
      "| stddev|471609.26868834975|\n",
      "|    min|            110152|\n",
      "|    max|         WE/P 5735|\n",
      "+-------+------------------+\n",
      "\n",
      "Column: Fare\n",
      "+-------+-----------------+\n",
      "|summary|             Fare|\n",
      "+-------+-----------------+\n",
      "|  count|              891|\n",
      "|   mean| 32.2042079685746|\n",
      "| stddev|49.69342859718089|\n",
      "|    min|              0.0|\n",
      "|    max|         512.3292|\n",
      "+-------+-----------------+\n",
      "\n",
      "Column: Cabin\n",
      "+-------+-----+\n",
      "|summary|Cabin|\n",
      "+-------+-----+\n",
      "|  count|  204|\n",
      "|   mean| null|\n",
      "| stddev| null|\n",
      "|    min|  A10|\n",
      "|    max|    T|\n",
      "+-------+-----+\n",
      "\n",
      "Column: Embarked\n",
      "+-------+--------+\n",
      "|summary|Embarked|\n",
      "+-------+--------+\n",
      "|  count|     889|\n",
      "|   mean|    null|\n",
      "| stddev|    null|\n",
      "|    min|       C|\n",
      "|    max|       S|\n",
      "+-------+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "describe_columns(titanic)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Find columns with missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_missing_values_columns(df):\n",
    "    nrow = df.count()\n",
    "    for v in df.columns:\n",
    "        summary_df = df.select(v).describe()\n",
    "        v_count = int(summary_df.collect()[0][v])\n",
    "        if v_count < nrow:\n",
    "            missing_percentage = (1 - v_count/nrow) * 100\n",
    "            print(\"Total observations: \" + str(nrow) + \"\\n\"\n",
    "                 \"Total observations of \" + v + \": \" + str(v_count) + \"\\n\"\n",
    "                 \"Percentage of missing values: \" + str(missing_percentage) + \"%\" + \"\\n\"\n",
    "                 \"----------------------------\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total observations: 891\n",
      "Total observations of Age: 714\n",
      "Percentage of missing values: 19.865319865319865%\n",
      "----------------------------\n",
      "Total observations: 891\n",
      "Total observations of Cabin: 204\n",
      "Percentage of missing values: 77.1043771043771%\n",
      "----------------------------\n",
      "Total observations: 891\n",
      "Total observations of Embarked: 889\n",
      "Percentage of missing values: 0.22446689113355678%\n",
      "----------------------------\n"
     ]
    }
   ],
   "source": [
    "find_missing_values_columns(titanic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
